from bs4 import BeautifulSoup
from urllib.request import urlopen
import random
import requests
import re
import os

# Randomly walk Baidu Baike pages by following in-site links.
def RecursiveRequests(base_url, history, num):
    """Randomly walk Baidu Baike pages starting from the last entry in *history*.

    At each step the current page (``base_url + history[-1]``) is fetched, its
    status and <h1> title are printed, and a random in-site ``/item/...`` link
    is pushed onto *history*.  Pages with no candidate links are popped off the
    stack (backtracking).  At most *num* pages are fetched.

    Args:
        base_url: Site root, e.g. ``'https://baike.baidu.com'``.
        history:  Stack of relative page paths; mutated in place.
        num:      Maximum number of page fetches; a non-positive value fetches
                  nothing (previously this looped until *history* was empty).
    """
    # Baike item links encode Chinese titles as %XX escapes; '+' requires at
    # least one such escape and '$' anchors the pattern at the end of the href.
    # Compiled once here instead of on every loop iteration.
    item_href = re.compile('/item/(%.{2})+$')
    while history and num > 0:
        url = base_url + history[-1]
        # Context manager guarantees the HTTP connection is closed even if
        # reading/decoding raises.
        with urlopen(url) as response:
            status = response.status
            html = response.read().decode('utf-8')
        soup = BeautifulSoup(html, 'lxml')
        heading = soup.find('h1')
        # Error pages may lack an <h1>; the original crashed with
        # AttributeError here.
        title = heading.get_text() if heading is not None else '<no title>'
        print(status, '\t', title, '\t', f'url={url}')
        sub_urls = soup.find_all(
            name='a',
            attrs={
                'target': '_blank',
                'href': item_href,
            })
        if sub_urls:
            new_his = random.choice(sub_urls)['href']
            # Some hrefs are absolute (prefixed with base_url); strip the
            # prefix so history only ever stores relative paths.
            if base_url in new_his:
                new_his = new_his[len(base_url):]
            history.append(new_his)
        else:
            history.pop()  # dead end: backtrack to the previous page
        num -= 1

# Entry point: start the random walk at the Baidu Baike article for
# "web crawler" and visit up to `num` pages.
num = 20  # maximum number of page visits
base_url = 'https://baike.baidu.com'
history = ['/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711']
RecursiveRequests(base_url, history, num)

